{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import copy\n",
    "import pandas as pd\n",
    "import csv\n",
    "import json\n",
    "from lxml import etree\n",
    "import os\n",
    "from tqdm import tqdm\n",
    "from collections import defaultdict\n",
    "import statistics\n",
    "import shutil\n",
    "from azure.storage.blob import BlobServiceClient\n",
    "from eMammal_helpers import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# print all outputs in a cell\n",
    "from IPython.core.interactiveshell import InteractiveShell\n",
    "InteractiveShell.ast_node_interactivity = \"all\"\n",
    "\n",
    "# auto reload external Python modules\n",
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "# display Matplotlib figures inline and set default size\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "plt.rcParams['figure.dpi'] = 120\n",
    "plt.rcParams['figure.figsize'] = (8.0, 3.0)\n",
    "plt.rcParams['image.interpolation'] = 'nearest'\n",
    "plt.rcParams['image.cmap'] = 'gray'\n",
    "plt.rcParams['font.size'] = 9"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# configurations and paths\n",
    "output_dir_path = '/home/yasiyu/yasiyu_temp'\n",
    "\n",
    "deployments_path = '/datadrive/emammal'\n",
    "\n",
    "projects_of_interest = ['p139', 'p158']  # the Robert Long collection had deployents from these two projects"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Prepare an iMerit job\n",
    "\n",
    "This notebooks samples a subset of the Robert Long collection of eMammal data from the Seattle area, and \n",
    "copies them to a folder to hand off for annotations to iMerit.\n",
    "\n",
    "Aim to have a sample of ~20k images:\n",
    "- exclude empty sequences\n",
    "- do not include too many humans\n",
    "- reasonable diversity across species\n",
    "- reasonable diversity across deployments\n",
    "- balance between day/night (time of day is okay for this purpose)\n",
    "- for simplicity, may avoid sequences with multiple _different_ species, since we'll use these labels for classification as well.\n",
    "- could bias towards species that the GIX project needs\n",
    "\n",
    "Image name format:\n",
    "\n",
    "`dataset[datasetname].project[projectID].deployment[deploymentID].seq[sequenceID].frame[frameNumber].img[imageID].jpg\n",
    "`\n",
    "\n",
    "e.g.\n",
    "\n",
    "`datasetemammal.project3062.deploymentd16546.seqd16546s14.frame001.imgd16546s14i1.jpg`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# find the deployments in the Robert Long collection\n",
    "\n",
    "deployments = []\n",
    "for deployment in os.listdir(deployments_path):\n",
    "    # we know that for deployments in the Robert Long collection, the folders are prefixed with the \n",
    "    # project ID, but in general this is not true.\n",
    "    of_interest = False\n",
    "    for proj in projects_of_interest:\n",
    "        if deployment.startswith(proj):\n",
    "            of_interest = True\n",
    "    if of_interest:\n",
    "        deployments.append(deployment)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(deployments)  # confirmed by using Storage Explorer that the 0Roboert Long collection has 126 zips indeed."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Make a dataframe, one row each sequence, columns include\n",
    "\n",
    "`full_seq_id, project_id, deployment_id, species, is_empty, is_human_only, is_multi_species, num_frames, start_time, is_daytime`\n",
    "\n",
    "species is semi-column separated list of unique species, sorted.\n",
    "\n",
    "Make another dataframe for images, one row per image, columns include\n",
    "\n",
    "`full_img_id, full_img_path, full_seq_id, project_id, project_name, deployment_id, site_name, species, img_id`\n",
    "\n",
    "Note that `full_img_id` is generated from information in the xml file only, and you might not be able to construct `full_img_path` from it (hence the need for this mapping) because of errors in naming the folder structures.\n",
    "\n",
    "Checked that all `full_img_path` exist on the data disk. Some images may be corrupted from previous experience.  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sequences_list = []\n",
    "images_list = []\n",
    "species_tally = defaultdict(int)\n",
    "\n",
    "# is the species distribution different for sequences with no more than 20 frames\n",
    "species_tally_short_seq = defaultdict(int)\n",
    "max_num_frames = 20\n",
    "\n",
    "for deployment in tqdm(deployments):\n",
    "    deployment_path = os.path.join(deployments_path, deployment)\n",
    "    manifest_path = os.path.join(deployment_path, 'deployment_manifest.xml')\n",
    "    \n",
    "    with open(manifest_path, 'r') as f:\n",
    "        tree = etree.parse(f)\n",
    "    \n",
    "    root = tree.getroot()\n",
    "    project_id = root.findtext('ProjectId')\n",
    "    project_name = root.findtext('ProjectName')\n",
    "    deployment_id = root.findtext('CameraDeploymentID')\n",
    "    site_name = root.findtext('CameraSiteName')\n",
    "    \n",
    "    image_sequences = root.findall('ImageSequence')\n",
    "    for sequence in image_sequences:\n",
    "        images = sequence.findall('Image')\n",
    "        num_frames = len(images)\n",
    "        \n",
    "        # sequences\n",
    "        seq_id = sequence.findtext('ImageSequenceId')\n",
    "        full_seq_id = 'datasetemammal.project{}.deployment{}.seq{}'.format(project_id, deployment_id, seq_id)\n",
    "        \n",
    "        # get species info\n",
    "        researcher_identifications = sequence.findall('ResearcherIdentifications')\n",
    "        species_in_seq = set()\n",
    "        for researcher_id in researcher_identifications:\n",
    "            identifications = researcher_id.findall('Identification')\n",
    "            for id in identifications:\n",
    "                species_common_name = clean_species_name(id.findtext('SpeciesCommonName'))\n",
    "                species_tally[species_common_name] += 1\n",
    "                species_in_seq.add(species_common_name)\n",
    "                \n",
    "                if num_frames <= max_num_frames:\n",
    "                    species_tally_short_seq[species_common_name] += 1\n",
    "                \n",
    "        species_str = ';'.join(sorted(list(species_in_seq)))\n",
    "    \n",
    "        seq_start_time = sequence.findtext('ImageSequenceBeginTime')\n",
    "        start_time = parse_timestamp(seq_start_time)\n",
    "    \n",
    "        sequences_list.append({\n",
    "            'full_seq_id': full_seq_id,\n",
    "            'project_id': project_id,\n",
    "            'deployment_id': deployment_id,\n",
    "            'species': species_str,\n",
    "            'is_empty': True if species_str == 'empty' else False,\n",
    "            'is_human_only': True if species_str == 'human' else False,\n",
    "            'is_multi_species': True if len(species_in_seq) > 1 else False,\n",
    "            'num_frames': num_frames,\n",
    "            'start_time': start_time,\n",
    "            'is_daytime': is_daytime(start_time)\n",
    "        })\n",
    "        \n",
    "        # images\n",
    "        for img in images:\n",
    "            img_id = img.findtext('ImageId')\n",
    "            img_file_name = img.findtext('ImageFileName')\n",
    "            assert img_file_name.endswith('.JPG')\n",
    "            try:\n",
    "                img_frame = clean_frame_number_4_digit(img.findtext('ImageOrder'))\n",
    "            except:\n",
    "                img_frame = img_file_name.split('i')[1].split('.')[0]\n",
    "                img_frame = clean_frame_number_4_digit(img_frame)\n",
    "            \n",
    "            full_img_id = 'datasetemammal.project{}.deployment{}.seq{}.frame{}.img{}'.format(project_id, deployment_id, seq_id, img_frame, img_id)\n",
    "            full_img_path = os.path.join(deployment_path, img_file_name)\n",
    "            \n",
    "            if not os.path.exists(full_img_path):\n",
    "                print('Path does not exist: {}'.format(full_img_path))\n",
    "            \n",
    "            images_list.append({\n",
    "                'full_img_id': full_img_id,\n",
    "                'full_img_path': full_img_path,\n",
    "                'full_seq_id': full_seq_id,\n",
    "                'project_id': project_id,\n",
    "                'project_name': project_name,\n",
    "                'deployment_id': deployment_id,\n",
    "                'site_name': site_name,\n",
    "                'species': species_str,\n",
    "                'img_id': img_id\n",
    "            })"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Images in deployements `p139d18649` only have four attributes:\n",
    "\n",
    "```\n",
    "<Element Image at 0x7f0feeb07388>\n",
    "<Element ImageId at 0x7f0feeb07288>\n",
    "<Element ImageFileName at 0x7f0feeb07508>\n",
    "<Element ImageInterestRanking at 0x7f0feebbf708>\n",
    "```\n",
    "\n",
    "Their frame number are assigned based on the image's file name."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(sequences_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(images_list)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Species distribution in the 0Robert Long WA collection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot_distribution(species_tally, title='Number of sequences with the species', top=30)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "species_tally_animals = copy.deepcopy(species_tally)\n",
    "species_tally_animals['human'] = 0\n",
    "species_tally_animals['empty'] = 0\n",
    "species_tally_animals['domestic dog'] = 0\n",
    "plot_distribution(species_tally_animals, title='Number of sequences with the species, excl. human, empty, dog', top=30)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Species distribution in the 0Robert Long WA collection, among sequences with 20 frames or fewer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot_distribution(species_tally_short_seq, title='Number of sequences with the species, num_frames <= 20', top=30)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "species_tally_animals = copy.deepcopy(species_tally_short_seq)\n",
    "species_tally_animals['human'] = 0\n",
    "species_tally_animals['empty'] = 0\n",
    "species_tally_animals['domestic dog'] = 0\n",
    "plot_distribution(species_tally_animals, title='Number of sequences with the species, excl. human, empty, dog, num_frames <= 20', top=30)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Understand the distribution of species and number of frames"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_seq = pd.DataFrame(sequences_list)\n",
    "df_img = pd.DataFrame(images_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(df_seq)\n",
    "df_seq.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(df_img)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_seq.sample(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_img.sample(n=3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Refine the sequences we want to label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_seq_animals = df_seq[(df_seq.is_empty == False) & (df_seq.is_human_only == False)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_seq_animals"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### What is the distribution of number of frames in sequences, in the sequences excluding empty, human-only and dog-walking?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "num_frames = df_seq_no_dog_walking.num_frames\n",
    "statistics.mean(num_frames)\n",
    "statistics.median(num_frames)\n",
    "max(num_frames)\n",
    "min(num_frames)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot_histogram(num_frames)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# how many of these frames are more than 20 long? - only about 5%\n",
    "sum(num > 20 for num in num_frames) / len(num_frames)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Select the sequences to be annotated\n",
    "#### Start from no human-only and non-empty sequences. Exclude sequences longer than 20 frames, and exclude sequences with multiple species"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_seq_short = df_seq_animals[(df_seq_animals.num_frames <= 20)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sum(df_seq_animals.num_frames)  # number of images\n",
    "sum(df_seq_short.num_frames)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Separate out dog-walking, horse-riding, coyote/mule deer (two most common non-domestic species) sequences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_seq_dog_walking = df_seq_short[df_seq_short.species == 'domestic dog;human']\n",
    "len(df_seq_dog_walking)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_seq_horse_riding = df_seq_short[df_seq_short.species == 'domestic horse;human']\n",
    "len(df_seq_horse_riding)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_seq_coyote_deer = df_seq_short[(df_seq_short.species == 'coyote') | (df_seq_short.species == 'mule deer')]\n",
    "len(df_seq_coyote_deer)\n",
    "df_seq_coyote_deer.sample(n=3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Get ride of all multi-species (not that many after excluding dog walking and horse riding ones) sequences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def_single_species = df_seq_short[df_seq_short.is_multi_species == False]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_seq_rare = def_single_species[(def_single_species.species != 'coyote') & (def_single_species.species != 'mule deer')]\n",
    "len(df_seq_rare)\n",
    "df_seq_rare.sample(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_seq_coyote_deer_sample = df_seq_coyote_deer.sample(n=800)\n",
    "len(df_seq_coyote_deer_sample)\n",
    "df_seq_coyote_deer_sample.sample(n=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_seq_good = pd.concat([df_seq_rare, df_seq_coyote_deer_sample])\n",
    "print('Total number of sequences: {}'.format(len(df_seq_good)))\n",
    "print('Total number of images: {}'.format(sum(df_seq_good.num_frames)))\n",
    "df_seq_good.sample(n=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "final_species = defaultdict(int)\n",
    "for sp in df_seq_good.species:\n",
    "    final_species[sp] += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot_distribution(final_species, title='Species in the resampled dataset', top=30)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Save a record of the sequences selected for relabeling \n",
    "Note that since the coyote and mule deer entries are sampled randomly, running this script again will NOT generate the same sequences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_seq_good.to_csv('/home/yasiyu/yasiyu_temp/0RobertLong_sequences_20180907.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# need to load df_seq_good from csv if you need to find the images in these sequences again."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Save a record of the images in the selected sequences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "img_selected_paths = []\n",
    "list_df_images_in_seq = []\n",
    "for full_seq_id in tqdm(df_seq_good.full_seq_id):\n",
    "    images_in_seq = df_img[df_img.full_seq_id == full_seq_id]\n",
    "    \n",
    "    pairs = zip(list(images_in_seq.full_img_path), list(images_in_seq.full_img_id))\n",
    "    img_selected_paths.extend(pairs)\n",
    "    \n",
    "    list_df_images_in_seq.append(images_in_seq)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(img_selected_paths)  # should be 14867"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_img_good = pd.concat(list_df_images_in_seq)\n",
    "len(df_img_good)\n",
    "df_img_good.sample(n=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_img_good.to_csv('/home/yasiyu/yasiyu_temp/0RobertLong_images_20180907.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(df_img_good.deployment_id.unique())  # number of deployments selected, out of a total of 126, so pretty good representation"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Copy out the images into a folder\n",
    "\n",
    "Need to rename the images to the full_img_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "img_selected_paths[14000]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dest_folder = '/home/yasiyu/yasiyu_temp/eMammal_20180907'\n",
    "for from_path, full_img_id in tqdm(img_selected_paths):\n",
    "    dest_path = os.path.join(dest_folder, '{}.jpg'.format(full_img_id))\n",
    "    res = shutil.copyfile(from_path, dest_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# zip the folder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# upload to blob storage - take a while\n",
    "key = os.environ[\"AZ_STORAGE_KEY\"]\n",
    "blob_service = BlobServiceClient(account_url='wildlifeblobssc.blob.core.windows.net', credential=key)\n",
    "with open('/home/yasiyu/yasiyu_temp/eMammal_20180907.zip', 'rb') as f:\n",
    "    blob_service.get_container_client('yasiyutemp').upload_blob(name='eMammal_20180907.zip', data=f)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Number of daytime/nighttime images"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "num_images_night = sum(df_seq_good[df_seq_good.is_daytime == False].num_frames) / sum(df_seq_good.num_frames)\n",
    "num_images_night"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "But if we take into account of all images including people walking dogs, there are now fewer nighttime pictures which makes sense since animals only come out after dark on trails / in general."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "num_images_night_tot = sum(df_seq[df_seq.is_daytime == False].num_frames) / sum(df_seq.num_frames)\n",
    "num_images_night_tot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:py35]",
   "language": "python",
   "name": "conda-env-py35-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
